PCA: Group 03

Load data

First the data is gathered from the different websites.

Load libraries

library(RCurl) 
library(jsonlite) 
library(dplyr) 

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(here)
here() starts at /Users/AK/Library/Mobile Documents/com~apple~CloudDocs/Skrivebord/STUDIE/3. studieår/R for Bio Data Science/group_03_project

Create data directory

# Make sure the 'data' directory and its '_raw' subdirectory exist.
# The parent is listed first so that it is created before the child.
for (dir_path in c(here("data"), here("data/_raw"))) {
  if (!dir.exists(dir_path)) {
    dir.create(dir_path)
  }
}

Get diabetes data

# Fetch the diabetes dataset (a zip archive) into a temporary file
url <- 
  "https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip"
temp <- tempfile(fileext = ".zip")

# Request a binary transfer explicitly instead of relying on download.file()
# guessing it from the URL; a text-mode transfer would corrupt the archive
download.file(url, temp, mode = "wb")

# Unzip files directly into the 'data' directory
unzip(temp, exdir = here("data"))

# The downloaded archive is not needed once it has been extracted
unlink(temp)

# The archive unpacks into a 'dataset_diabetes' subfolder, which we would like
# to get rid of: its files are copied into 'data/_raw' instead
path_a <- here("data", "dataset_diabetes")
path_b <- here("data", "_raw")

# Full paths of everything that was extracted into 'path_a'
my_files <- list.files(path_a, full.names = TRUE)

# Copy the files into 'path_b', replacing copies left by an earlier run
file.copy(from = my_files, to = path_b, overwrite = TRUE)
[1] TRUE TRUE
# Delete the extracted subfolder. Its files were copied (not moved) above, so
# it still holds the originals; recursive = TRUE removes them with the folder
unlink(path_a, recursive = TRUE)

Get ICD9 data

# Fetch the ICD-9-CM (version 32) code descriptions as a zip archive
url_ICD9 <- "https://www.cms.gov/medicare/coding/icd9providerdiagnosticcodes/downloads/icd-9-cm-v32-master-descriptions.zip"

temp <- tempfile()

download.file(url_ICD9, temp)

# Unzip files directly into the '_raw' directory
unzip(temp, exdir = here("data/_raw"))

# The zip archive also contains data on surgeries (SG), which is not relevant
# here, and Excel versions of the tables. The unused files are removed.
unlink(here("data/_raw/CMS32_DESC_LONG_SG.txt"))
unlink(here("data/_raw/CMS32_DESC_SHORT_SG.txt"))
# Extension written in lower case, like the DX workbook on the next line: the
# previous ".xlSX" spelling only matched on case-insensitive file systems.
# NOTE(review): assumes the archive names this file with a lower-case extension
unlink(here("data/_raw/CMS32_DESC_LONG_SHORT_SG.xlsx"))
unlink(here("data/_raw/CMS32_DESC_LONG_SHORT_DX.xlsx"))

Clean data

The four different files are then combined, and the different encodings of missing values are unified as NA.

Load libraries

suppressWarnings({
library("tidyverse")
library("here")
library("broom")
})

Clean the meta data

suppressWarnings({
# The mapping file is read without a header row, using our own column names
meta_data <- read_csv(here("data/_raw/IDs_mapping.csv"),
                      na = "",
                      col_names = c("type_id", "description"),
                      show_col_types = FALSE) |>
  # Drop rows that contain NA
  na.omit() |>
  # Numeric copy of the id. Rows whose id is not a number become NA here;
  # those rows hold the name of the kind of metadata the ids below describe
  mutate(type_id_as_num = as.numeric(type_id)) |>
  # Keep that name in meta_type (NA on the ordinary, numeric rows) ...
  mutate(meta_type = ifelse(is.na(type_id_as_num), type_id, NA)) |>
  # ... and carry it down onto the rows that follow it
  fill(meta_type)

# Remove the rows that only contain the names, drop the text id and let the
# numeric id take over the name type_id
meta_data_clean <- meta_data |>
  filter(!is.na(type_id_as_num)) |>
  select(-type_id) |>
  rename(type_id = type_id_as_num)

})

# Extract the id/description table for one kind of id from meta_data_clean and
# set descriptions that only stand for "missing" to NA
lookup_for <- function(wanted_type, placeholder_labels) {
  meta_data_clean |>
    filter(meta_type == wanted_type) |>
    select(-meta_type) |>
    mutate(description = replace(description,
                                 description %in% placeholder_labels,
                                 NA))
}

admission_type <- lookup_for(
  "admission_type_id",
  c("NULL", "Not Mapped", "Not Available")
)

discharge_disposition <- lookup_for(
  "discharge_disposition_id",
  c("NULL", "Not Mapped", "Unknown/Invalid")
)

admission_source <- lookup_for(
  "admission_source_id",
  c("NULL", "Not Mapped", "Unknown/Invalid", "Not Available")
)

Clean ICD9 data

The data is downloaded here as the newest version from 2014:

ICD-9-CM Diagnosis and Procedure Codes: Abbreviated and Full Code Titles | CMS

From the downloaded files, the short and long DX descriptions are used (SG files cover surgical procedures, DX files cover diagnoses).

When running the 01_load.qmd, all the relevant files should be downloaded and placed in data/_raw

# Each file is first read as a single text column, one row per line.
# The first value in each line is the ID, but the delimiter is not preserved.
# separate() therefore splits each line at the first run of non-alphanumeric
# characters (its default separator), and extra = "merge" keeps everything
# after that split together in the second column.
ICD9_short <- read_delim(here("data/_raw/CMS32_DESC_SHORT_DX.txt"),
                         delim = "\n",
                         col_names = "data",
                         show_col_types = FALSE) |>
  separate(col = data,
           into = c("ID", "Description_short"),
           extra = "merge")

# The long descriptions include é and è, for example in Ménière's disease.
# To deal with this, the text is converted to UTF-8 before it is split.
ICD9_long <- read_delim(here("data/_raw/CMS32_DESC_LONG_DX.txt"),
                        delim = "\n",
                        col_names = "data",
                        show_col_types = FALSE) |>
  mutate(data = iconv(data,
                      from = "latin1",
                      to = "UTF-8",
                      sub = "byte")) |>
  separate(col = data,
           into = c("ID", "Description_long"),
           extra = "merge")

# The long and short tables should have identical IDs. In case they differ we
# still want all the data, hence a full join.
ICD9 <- ICD9_short |>
  full_join(ICD9_long, by = "ID")

# Rewrite the undotted numeric ICD-9 codes with a "." after the third
# character (presumably the form used by the diag_* columns they are joined
# to later). Codes starting with V or E are left untouched.
# ifelse() returns a character vector here, so the as.double() result is turned
# back into text; that round trip drops leading and trailing zeros
# (e.g. "0010" -> "001.0" -> 1 -> "1").
# as.double() is evaluated for the V/E codes as well and warns about the NAs
# it produces for them, hence suppressWarnings().
suppressWarnings({
ICD9 <- ICD9 |> 
  mutate(ID = ifelse(str_starts(ID, "[VE]"),
                     ID,
                     # nchar(ID) is the per-code string length; the previous
                     # length(ID) was the number of rows and only worked
                     # because substr() stops at the end of the string
                     as.double(str_c(substr(ID, 1, 3),
                                     ".",
                                     substr(ID, 4, nchar(ID))))))
})

Clean diabetes data

# "?" and "None" are both read as missing values
diabetes_data <- read_csv(here("data/_raw/diabetic_data.csv"),
                          na = c("?", "None"),
                          show_col_types = FALSE) |>
  mutate(
    # Change NO in readmitted column to No
    readmitted = str_replace(readmitted, "NO", "No"),
    # Change Ch in change column to Yes
    change = str_replace(change, "Ch", "Yes")
  )

Join information from metadata to diabetes data

# Replace each of the three id columns by its description: join the matching
# lookup table, name the description after the id column, move it to where the
# id column was and drop the id column.
# relationship = "many-to-one" makes the join fail if an id occurs more than
# once in a lookup table.
full_data <- diabetes_data |>
  left_join(admission_type,
            join_by(admission_type_id == type_id),
            relationship = "many-to-one") |>
  rename(admission_type = description) |>
  relocate(admission_type, .after = admission_type_id) |>
  select(-admission_type_id) |>
  left_join(admission_source,
            join_by(admission_source_id == type_id),
            relationship = "many-to-one") |>
  rename(admission_source = description) |>
  relocate(admission_source, .after = admission_source_id) |>
  select(-admission_source_id) |>
  left_join(discharge_disposition,
            join_by(discharge_disposition_id == type_id),
            relationship = "many-to-one") |>
  rename(discharge_disposition = description) |>
  relocate(discharge_disposition, .after = discharge_disposition_id) |>
  select(-discharge_disposition_id)

Join ICD9 to clean data

# Attach the short and long ICD-9 descriptions to each of the three diagnosis
# columns. A code without a match in ICD9 keeps the raw code as its
# description; missing codes are never matched (na_matches = "never").
full_data <- full_data |>
  left_join(ICD9,
            join_by(diag_1 == ID),
            relationship = "many-to-one",
            na_matches = "never") |>
  rename(diag_1_short = Description_short,
         diag_1_long = Description_long) |>
  mutate(diag_1_short = coalesce(diag_1_short, diag_1),
         diag_1_long = coalesce(diag_1_long, diag_1)) |>
  left_join(ICD9,
            join_by(diag_2 == ID),
            relationship = "many-to-one",
            na_matches = "never") |>
  rename(diag_2_short = Description_short,
         diag_2_long = Description_long) |>
  mutate(diag_2_short = coalesce(diag_2_short, diag_2),
         diag_2_long = coalesce(diag_2_long, diag_2)) |>
  left_join(ICD9,
            join_by(diag_3 == ID),
            relationship = "many-to-one",
            na_matches = "never") |>
  rename(diag_3_short = Description_short,
         diag_3_long = Description_long) |>
  mutate(diag_3_short = coalesce(diag_3_short, diag_3),
         diag_3_long = coalesce(diag_3_long, diag_3))

# Put the six description columns where the raw codes were, then drop the codes
full_data <- full_data |>
  relocate(c(diag_1_short,
             diag_2_short,
             diag_3_short,
             diag_1_long,
             diag_2_long,
             diag_3_long), .after = diag_1) |>
  select(-diag_1, -diag_2, -diag_3)

Output clean data

# Save the cleaned, joined table; the augment step reads it back from this file
write_csv(full_data, here("data/02_dat_clean.csv"))

Augment data

Two columns are created: one for the number of encounters per patient and one for the total number of days the patient has spent in the hospital within the dataset.

Load libraries

library("here")
library("tidyverse")

Load data

# Read the cleaned data written by the cleaning step; column types are guessed
# by read_csv() (see the column specification printed below)
clean_data <- read_csv(here("data/02_dat_clean.csv")) 
Rows: 101766 Columns: 53
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (43): race, gender, age, weight, admission_type, discharge_disposition, ...
dbl (10): encounter_id, patient_nbr, time_in_hospital, num_lab_procedures, n...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Encounters in dataset

Add a column that gives the number of encounters each patient has in the data set.

# Count the rows per patient and attach that count to every encounter of the
# patient; the join keeps all rows of the original table
clean_data <- clean_data |>
  count(patient_nbr, name = "nr_encounters") |>
  right_join(clean_data, join_by(patient_nbr)) |>
  relocate(c(patient_nbr, nr_encounters), .after = encounter_id)

Total time in hospital

Add column for total amount of days spent in hospital

# Sum the days in hospital over all encounters of a patient and attach that
# total to every encounter of the patient
clean_data <- clean_data |>
  group_by(patient_nbr) |>
  summarise(total_time_hospital = sum(time_in_hospital)) |>
  right_join(clean_data, join_by(patient_nbr)) |>
  relocate(total_time_hospital, .after = time_in_hospital)

Write csv

# The joins above moved patient_nbr to the front; put it back after encounter_id
clean_data <- clean_data |>
  relocate(patient_nbr, .after = encounter_id)

# The same table is written to the data folder and to the Shiny_app folder
for (out_path in c(here("data/03_dat_aug.csv"),
                   here("R/Shiny_app/03_dat_aug.csv"))) {
  write_csv(clean_data, out_path)
}

Description

Different key statistics of the data are found.

Load libraries

library("here")
library("tidyverse")

Load data

# Read the augmented data (clean data plus nr_encounters and total_time_hospital)
aug_data <- read_csv(here("data/03_dat_aug.csv")) 
Rows: 101766 Columns: 55
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (43): race, gender, age, weight, admission_type, discharge_disposition, ...
dbl (12): encounter_id, patient_nbr, nr_encounters, time_in_hospital, total_...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Total amount of encounters:

# Number of rows in the table; count() without grouping returns a single n
aug_data |> 
  select(encounter_id) |> 
  count()
# A tibble: 1 × 1
       n
   <int>
1 101766

Total amount of patients:

# Number of different patient numbers: one row per patient, then count the rows
aug_data |> 
  distinct(patient_nbr) |> 
  count()
# A tibble: 1 × 1
      n
  <int>
1 71518

Patient with most time spent in hospital

# Largest per-patient total stay, extracted as a plain number
max_time <- aug_data |> 
  summarise(max(total_time_hospital)) |> 
  pull()

# total_time_hospital is repeated on every encounter of a patient; distinct()
# collapses that to one row per patient before picking the maximum
aug_data |> 
  distinct(patient_nbr, total_time_hospital) |> 
  filter(total_time_hospital == max_time)
# A tibble: 1 × 2
  patient_nbr total_time_hospital
        <dbl>               <dbl>
1    84428613                 180

Patient with most encounters

# Highest number of encounters any patient has, extracted as a plain number
max_enc <- aug_data |> 
  summarise(max(nr_encounters)) |> 
  pull()

# One row per patient (nr_encounters is constant within a patient), then keep
# the patient(s) that reach the maximum
aug_data |> 
  distinct(patient_nbr, nr_encounters) |> 
  filter(nr_encounters == max_enc)
# A tibble: 1 × 2
  patient_nbr nr_encounters
        <dbl>         <dbl>
1    88785891            40

Patient on most amount of medication

# Highest num_medications value in any row. The column is not aggregated per
# patient anywhere in this pipeline, so this is a per-encounter maximum
max_med <- aug_data |> 
  summarise(max(num_medications)) |> 
  pull()

# Patient(s) with at least one encounter that reaches that maximum
aug_data |> 
  distinct(patient_nbr, num_medications) |> 
  filter(num_medications == max_med)
# A tibble: 1 × 2
  patient_nbr num_medications
        <dbl>           <dbl>
1    24189597              81

Patients with most amount diagnoses

# Highest number_diagnoses value in any row (per encounter, not summed per patient)
max_dia <- aug_data |> 
  summarise(max(number_diagnoses)) |> 
  pull()

# Patients with at least one encounter that reaches that maximum; distinct()
# lists a patient once even if several of their encounters reach it
aug_data |> 
  distinct(patient_nbr, number_diagnoses) |> 
  filter(number_diagnoses == max_dia)
# A tibble: 45 × 2
   patient_nbr number_diagnoses
         <dbl>            <dbl>
 1    30577455               16
 2    31059126               16
 3    37748088               16
 4    38158101               16
 5    40595031               16
 6    40634091               16
 7    40851090               16
 8    41015448               16
 9    41164047               16
10    41306769               16
# ℹ 35 more rows

Age distribution

# Rows per age group. No distinct() on patient_nbr is applied, so this counts
# encounters rather than unique patients (the counts add up to the row total)
aug_data |> 
  group_by(age) |> 
  count()
# A tibble: 10 × 2
# Groups:   age [10]
   age          n
   <chr>    <int>
 1 [0-10)     161
 2 [10-20)    691
 3 [20-30)   1657
 4 [30-40)   3775
 5 [40-50)   9685
 6 [50-60)  17256
 7 [60-70)  22483
 8 [70-80)  26068
 9 [80-90)  17197
10 [90-100)  2793

Gender distribution

# Patients per gender. distinct() works on the (patient, gender) pair, so a
# patient recorded with more than one gender is counted once under each;
# that is why the counts add up to slightly more than the number of patients
aug_data |> 
  distinct(patient_nbr, gender) |> 
  group_by(gender) |> 
  count()
# A tibble: 3 × 2
# Groups:   gender [3]
  gender              n
  <chr>           <int>
1 Female          38026
2 Male            33492
3 Unknown/Invalid     3

Race distribution

# Patients per race. distinct() works on the (patient, race) pair, so a patient
# recorded with more than one race value (including NA) is counted once under
# each; the counts therefore add up to more than the number of patients
aug_data |>
  distinct(patient_nbr, race) |> 
  group_by(race) |> 
  count()
# A tibble: 6 × 2
# Groups:   race [6]
  race                n
  <chr>           <int>
1 AfricanAmerican 12932
2 Asian             517
3 Caucasian       53601
4 Hispanic         1534
5 Other            1209
6 <NA>             1977

Visualizations

The data is visualized to discover trends.

Libraries

library(tidyverse)
library(here)

Load data

# Read the augmented data with base read.csv(), so `data` is a plain data.frame
# (the earlier sections used readr::read_csv()). Every plot below reads from
# this object, which shares its name with the base function data().
data <- read.csv(here("data/03_dat_aug.csv"))

Age distribution

# Mean number_diagnoses over all encounters in each age group / gender
# combination, drawn as side-by-side bars per age group
data |>
  group_by(age, gender) |>
  summarise(average_diagnoses = mean(number_diagnoses), .groups = "drop") |>
ggplot(aes(x = age,
                y = average_diagnoses,
                fill = gender)) +
  geom_col(position = "dodge") +
  labs(x = "Age Group",
       y = "Average number of diagnoses",
       fill = "Gender",
       title = "Average number of diagnoses for all age groups and genders per encounter") +
  # The colors are unnamed, so they are assigned to the gender values in their
  # sorted order (Female, Male, Unknown/Invalid in this data)
  scale_fill_manual(values=c('hotpink',
                             'cornflowerblue',
                             'grey35')) 

Age distribution, shown by race

# Same averages as the previous plot, split into one panel per race.
# Missing race and the existing 'Other' category are pooled into one label.
data |>
  mutate(race = ifelse(is.na(race) | race == 'Other',
                       'Other/Unspecified',
                       race)) |>
  group_by(age, gender, race) |>
  summarize(average_diagnoses = mean(number_diagnoses), .groups = "drop") |>
  ggplot(mapping = aes(x = age, y = average_diagnoses, fill = gender)) +
  geom_col(position = "dodge", alpha = 1.0) +
  facet_wrap(~ race) +
  labs(x = "Age Group",
       y = "Average number of diagnoses",
       fill = "Gender",
       title =
         "Average number of diagnoses for all age groups and genders,\nfacet wrapped for race") +
  scale_fill_manual(values = c('hotpink', 'cornflowerblue', 'grey35')) +
  theme_minimal() +
  # The legend is placed inside the plotting area, in the lower right region
  theme(axis.text.x = element_text(angle = 60, hjust =1),
        legend.position = 'inside',
        legend.position.inside = c(0.85,0.15))

Race/gender distribution

# Stacked bars of unique patients per race and gender, labelled with the counts.
# distinct(.keep_all = TRUE) keeps the first row of each patient, so that row
# decides which race and gender the patient is counted under.
data |>
  select(patient_nbr, race, gender) |>
  distinct(patient_nbr, .keep_all = TRUE) |>
  mutate(race = ifelse(is.na(race), "Other", race)) |>
  group_by(race, gender) |>
  summarise(count = n(), .groups = "drop") |>
  # Labels of large counts are drawn in white inside the bar; labels of small
  # counts are drawn above the bar in the color of their gender
  mutate(text_color = ifelse(count > 4000, "white", 
                       ifelse(gender == "Female", "hotpink", 
                        ifelse(gender == "Male", "cornflowerblue", "grey35")))) |>
  ggplot(aes(x = race, y = count, fill = gender)) +
  geom_bar(stat = "identity", position = "stack") +
  # A function given as layer data is applied to the plot data, so each text
  # layer filters the summary above instead of recomputing it from scratch
  geom_text(
    aes(label = count, color = text_color),
    data = function(counts) filter(counts, count > 4000),
    position = position_stack(vjust = 0.5),
    size = 4
  ) +
  geom_text(
    aes(label = count, color = text_color),
    data = function(counts) filter(counts, count <= 4000),
    position = position_dodge2(width = 0.8),
    size = 4,
    vjust = -3
  ) +
  # text_color already holds color names, so use them as they are
  scale_color_identity() +
  labs(
    x = "Race", 
    y = "Count", 
    fill = "Gender",
    # n_distinct() yields a plain number; previously a one-cell data frame
    # was pasted into the title
    title = str_c("Gender/race distribution of dataset, total no. of patients: ", 
                  n_distinct(data$patient_nbr))
    ) +
  scale_fill_manual(values = c("hotpink", "cornflowerblue", "grey35")) +
  theme_minimal() +
  theme(legend.position = "bottom")

Discharge disposition

# The 8 most frequent discharge dispositions for any admission type, as a
# vector. A missing disposition counts as a category at this point and is
# only dropped before plotting.
top_8_discharge_dispositions <- data |>
  count(discharge_disposition, sort = TRUE) |>
  slice_head(n = 8) |>
  pull(discharge_disposition)

# Encounters per disposition and age group, restricted to the top 8
data |>
  filter(discharge_disposition %in% top_8_discharge_dispositions) |>
  count(discharge_disposition, age) |>
  drop_na(discharge_disposition) |>
  ggplot(aes(x = discharge_disposition, y = n, fill = age)) +
  geom_col(position = "dodge") +
  scale_fill_viridis_d(option = "viridis") +
  # Wrap long disposition names so the axis labels stay readable
  scale_x_discrete(labels = function(x) str_wrap(x, width = 35)) +
  labs(x = "Discharge disposition",
       y = "Count",
       fill = "Age group",
       title = "Discharge disposition for all age groups for all admission types") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust =1))

Only for emergency admission type

# The 8 most frequent discharge dispositions among emergency admissions only
top_8_discharge_dispositions <- data |>
  filter(admission_type == "Emergency") |>
  count(discharge_disposition, sort = TRUE) |>
  slice_head(n = 8) |>
  pull(discharge_disposition)

# Emergency encounters per disposition and age group, restricted to the top 8
data |>
  filter(admission_type == "Emergency") |>
  filter(discharge_disposition %in% top_8_discharge_dispositions) |>
  count(discharge_disposition, age) |>
  drop_na(discharge_disposition) |>
  ggplot(aes(x = discharge_disposition, y = n, fill = age)) +
  geom_col(position = "dodge") +
  scale_fill_viridis_d(option = "viridis") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 35)) +
  labs(x = "Discharge disposition",
       y = "Count",
       fill = "Age group",
       title = "Discharge disposition for all age groups for emergency admissions") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust =1))

Only urgent admission type:

# The 8 most frequent discharge dispositions among urgent admissions only
top_8_discharge_dispositions <- data |>
  filter(admission_type == "Urgent") |>
  count(discharge_disposition, sort = TRUE) |>
  slice_head(n = 8) |>
  pull(discharge_disposition)

# Urgent encounters per disposition and age group, restricted to the top 8
data |>
  filter(admission_type == "Urgent") |>
  filter(discharge_disposition %in% top_8_discharge_dispositions) |>
  count(discharge_disposition, age) |>
  drop_na(discharge_disposition) |>
  ggplot(aes(x = discharge_disposition, y = n, fill = age)) +
  geom_col(position = "dodge") +
  scale_fill_viridis_d(option = "viridis") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 35)) +
  labs(x = "Discharge disposition",
       y = "Count",
       fill = "Age group",
       title = "Discharge disposition for all age groups for urgent admissions") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust =1))

For elective admission type

# The 8 most frequent discharge dispositions among elective admissions only
top_8_discharge_dispositions <- data |>
  filter(admission_type == "Elective") |>
  count(discharge_disposition, sort = TRUE) |>
  slice_head(n = 8) |>
  pull(discharge_disposition)

# Elective encounters per disposition and age group, restricted to the top 8
data |>
  filter(admission_type == "Elective") |>
  filter(discharge_disposition %in% top_8_discharge_dispositions) |>
  count(discharge_disposition, age) |>
  drop_na(discharge_disposition) |>
  ggplot(aes(x = discharge_disposition, y = n, fill = age)) +
  geom_col(position = "dodge") +
  scale_fill_viridis_d(option = "viridis") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 35)) +
  labs(x = "Discharge disposition",
       y = "Count",
       fill = "Age group",
       title = "Discharge disposition for all age groups for elective admissions") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust =1))

Number of visits per patient

# For every combination of visit count and gender, show how many patients have
# it: one point per combination, sized by the number of patients.
data |>
  select(patient_nbr,gender) |>
  group_by(patient_nbr) |>
  # Rows per patient = number of visits of that patient
  mutate(number_of_visits = n()) |>
  # Keep the first row of each patient, so that row decides the gender used
  distinct(patient_nbr,.keep_all = TRUE) |>
  ungroup() |>
  select(gender, number_of_visits) |>
  group_by(number_of_visits,gender) |>
  summarise(count = n(), .groups = 'drop') |>
  ggplot(aes(x = number_of_visits, y = gender)) + 
  # size is mapped to the precomputed count; shape gets one level per distinct
  # count value so that the smallest count can be drawn differently (see below)
  geom_count(aes(color = gender, 
                 size = count,  
                 shape = factor(count)),
      position = position_jitter(height = 0.2)) +
  labs(x = "Number of visits",
       y = "Gender",
       title = "Number of visits by gender") +
  scale_color_manual(values = c("hotpink", "cornflowerblue", "grey35")) +
  scale_size_continuous(
    range = c(2, 10), 
    trans = 'sqrt',  
    breaks = c(1, 5, 100, 1000, 10000, 20000),
    labels = c('1', '5', '100', '1000', '10000', '20000+')) +
  # First factor level (the smallest count) is a triangle (17), the next 28
  # levels are circles (16).
  # NOTE(review): more than 29 distinct count values would run out of shapes
  scale_shape_manual(values=c(17,rep(16, 28))) +  
  theme_minimal() +
  theme(legend.position = 'bottom',
        legend.justification = "center") +
        # Only the size legend is shown; its first key is drawn as a triangle
        # to match the shape used for the smallest count
        guides( color = guide_none(),
          size = guide_legend(title = "Counts", 
                        override.aes = list(shape = c(17,rep(16,5))),
                        ncol = 6),  
          shape = guide_none()) +
  coord_flip()

Diagnosis vs. visits

# Distribution of the number of diagnoses, per gender, for patients with
# exactly one encounter. Each of these patients has a single row, so
# number_diagnoses is plotted directly; the former numbering of visits and
# pivot_wider() only renamed that column to visit_1.
data |>
  filter(nr_encounters == 1) |>
  select(patient_nbr, number_diagnoses, gender) |>
  ggplot(aes(x = number_diagnoses, fill = gender)) +
  geom_boxplot() +
  labs(title = 'Amount of diagnoses for people with only 1 visit',
       x ='Number of diagnoses',
       fill = 'Gender') + 
  scale_fill_manual(values=c('hotpink',
                             'cornflowerblue',
                             'grey35')) 

# Distribution of the number of diagnoses at the first and at the second visit
# for patients with exactly two encounters. Visits are numbered per patient in
# order of encounter_id. The label is built directly in long form; the former
# pivot_wider() followed by pivot_longer() reproduced the same rows.
data |>
  filter(nr_encounters == 2) |>
  select(patient_nbr, number_diagnoses, encounter_id) |>
  group_by(patient_nbr) |>
  arrange(encounter_id, .by_group = TRUE) |>
  mutate(visit = str_c("visit_", row_number())) |>
  ungroup() |>
  ggplot(aes(x = number_diagnoses, fill = visit)) +
  geom_boxplot() +
  labs(title = str_c('Amount of diagnoses for people with 2 visits (',
                      data |>
                       filter(nr_encounters == 2) |>
                       pull(patient_nbr) |>
                       unique() |>
                       length(),
                     ' people)'),
       x ='Number of diagnoses',
       fill = 'Visit number') +
  scale_fill_manual(
    values = c("visit_1" = "lightblue3", "visit_2" = "seagreen3"), 
    labels = c("1", "2") 
  )

# Distribution of the number of diagnoses at each of the three visits for
# patients with exactly three encounters. Visits are numbered per patient in
# order of encounter_id. The label is built directly in long form; the former
# pivot_wider() followed by pivot_longer() reproduced the same rows.
data |>
  filter(nr_encounters == 3) |>
  select(patient_nbr, number_diagnoses, encounter_id) |>
  group_by(patient_nbr) |>
  arrange(encounter_id, .by_group = TRUE) |>
  mutate(visit = str_c("visit_", row_number())) |>
  ungroup() |>
  ggplot(aes(x = number_diagnoses, fill = visit)) +
  geom_boxplot() +
  labs(title = str_c('Amount of diagnoses for people with 3 visits (',
                     data |>
                       filter(nr_encounters == 3) |>
                       pull(patient_nbr) |>
                       unique() |>
                       length(),
                     ' people)'),
       x ='Number of diagnoses',
       fill = 'Visit number') +
scale_fill_manual(
    values = c("visit_1" = "lightblue3", "visit_2" = "seagreen3","visit_3" = 'darkolivegreen'), 
    labels = c("1", "2","3") 
  )

Plot of average of number of diagnosis per patient per maximum visit:

# Mean number of diagnoses at the 1st, 2nd, 3rd, ... visit of a patient, with
# each point colored by how many patients have a visit with that number.
data |>
  select(patient_nbr, number_diagnoses,encounter_id) |>
  group_by(patient_nbr) |>
  # Number the visits of each patient in order of encounter_id
  arrange(encounter_id, .by_group = TRUE) |>
  mutate(visit_num = row_number()) |>
  ungroup() |>
  select(-encounter_id) |>
  # Wide then long again: patients with fewer visits get NA for the visit
  # numbers they never reach, and those rows are removed by drop_na() below
  pivot_wider(
    names_from = visit_num,                
    values_from = number_diagnoses,        
    names_prefix = "visit_",          
    values_fill = list(number_diagnoses = NA)) |>
  pivot_longer(cols = c(-patient_nbr),
               names_to = 'visit',
               values_to = 'diagnosis') |>
  drop_na(diagnosis) |>
  group_by(visit) |>
  summarise(mean_diagnosis = mean(diagnosis), no_people = n()) |>
  # Keep only the visit number and order the factor levels numerically, so the
  # axis runs 1, 2, 3, ... instead of alphabetically (1, 10, 11, ...)
  mutate(visit = str_extract(visit, "\\d+"),
         visit = factor(visit,levels=sort(as.numeric(visit)))) |>
  ggplot(aes(x=visit,y=mean_diagnosis)) +
  geom_point(aes(color=no_people),size=3) +
  # The lower limit of 2 puts visits reached by a single patient outside the
  # scale, so they are drawn in na.value (green), as the subtitle states
  scale_color_gradient2(low='hotpink',
                     mid='orange',
                     high='cornflowerblue',
                     midpoint=12000,
                     limits=c(2,NA),
                     n.breaks=5,
                     transform = 'sqrt',
                     na.value = 'forestgreen',
                     guide = guide_colorbar(title = "Amount of people")
                     ) +
  theme_minimal() +
  theme(axis.text.x=element_text(angle=90,hjust=1)) + 
  labs(title = 'Average number of diagnoses per visit by patient',
       subtitle = '- Green dots signify 1 person',
       x = 'Number of visit by patients',
       y = 'Average number of diagnoses') 

Most common primary diagnoses compared to number of visits

# Heat map of the 10 most frequent primary diagnoses against the total number
# of encounters of the patient (nr_encounters).
data |>
  select(patient_nbr, encounter_id, nr_encounters, diag_1_long) |>
  group_by(diagnosis = diag_1_long) |>
  summarise(count = n(), .groups = "drop") |>
  # slice_max() replaces the superseded top_n(); like top_n() it keeps ties
  slice_max(count, n = 10) |>
  # Bring back all rows whose primary diagnosis is one of those ten
  inner_join(data, by = c("diagnosis" = "diag_1_long")) |>
  group_by(nr_encounters, diagnosis) |>
  # NOTE(review): n() counts rows (encounters), while the legend below says
  # "Amount of patients" - confirm which of the two is intended
  summarise(diagnosis_count = n(), .groups = "drop") |>
  ggplot(aes(x = nr_encounters, y = diagnosis, fill = diagnosis_count)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "orange", high = "purple") +
  scale_y_discrete(labels = function(y) str_wrap(y, width = 45)) +
  labs(
    x = "Number of visits",
    y = "",
    fill = "Amount of patients",
    title = "Top 10 most common primary diagnosis per visit"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1),
        legend.position = 'bottom',
        legend.background = element_rect(linewidth = 0.5, color = 'grey'),
        legend.text = element_text(angle=20,hjust=1),
        legend.justification = 'left',
        legend.title.position = 'top',
        plot.title = element_text(hjust = -3))

Gender discrepancy for patients

# List all encounters of patients that are recorded with more than one gender
# value, ordered by encounter_id within each patient
data |>
  group_by(patient_nbr) |>
  filter(n_distinct(gender) > 1) |>
  arrange(encounter_id, .by_group = TRUE)
# A tibble: 7 × 55
# Groups:   patient_nbr [3]
  encounter_id patient_nbr nr_encounters race      gender age     weight
         <int>       <int>         <int> <chr>     <chr>  <chr>   <chr> 
1    220872936    40867677             3 <NA>      Male   [40-50) <NA>  
2    232356060    40867677             3 <NA>      Female [40-50) <NA>  
3    291975978    40867677             3 <NA>      Male   [40-50) <NA>  
4     20729328    55500588             2 Caucasian Female [70-80) <NA>  
5     96312942    55500588             2 Caucasian Male   [80-90) <NA>  
6    183845022   109210482             2 Caucasian Female [50-60) <NA>  
7    186533256   109210482             2 Caucasian Male   [50-60) <NA>  
# ℹ 48 more variables: admission_type <chr>, discharge_disposition <chr>,
#   admission_source <chr>, time_in_hospital <int>, total_time_hospital <int>,
#   payer_code <chr>, medical_specialty <chr>, num_lab_procedures <int>,
#   num_procedures <int>, num_medications <int>, number_outpatient <int>,
#   number_emergency <int>, number_inpatient <int>, diag_1_short <chr>,
#   diag_2_short <chr>, diag_3_short <chr>, diag_1_long <chr>,
#   diag_2_long <chr>, diag_3_long <chr>, number_diagnoses <int>, …

Diabetes types

# Collect the long descriptions of all three diagnosis columns, keep those
# that contain "iabetes", reduce them to one row per distinct
# (patient, description) pair and count the rows per description.
diabetes_count <- data |>
  select(patient_nbr,ends_with('_long')) |>
  pivot_longer(cols = ends_with('_long'),
               values_to = 'all_diagnoses',
               names_to = 'diag_order') |>
  select(-diag_order) |>
  group_by(patient_nbr) |>
  filter(str_detect(all_diagnoses,'iabetes')) |>
  distinct() |>
  group_by(all_diagnoses) |>
  summarise(count=n()) 

# Classify every description by diabetes type, control status and
# complication, then plot the counts.
# NOTE(review): a patient with several different diabetes descriptions adds to
# several counts, so the total in the subtitle can exceed the number of patients
diabetes_count |>
  mutate(type = ifelse(str_detect(all_diagnoses,'type I '),'Type I','Type II or unspecified'),
         uncontrolled = ifelse(str_detect(all_diagnoses,', uncontrolled'), 
                               'Uncontrolled', 'Not specified as uncontrolled'),
         no_complication = ifelse(str_detect(all_diagnoses, 'without mention of complication'),
                                  'No complication','Complications')) |>
  # The complication is the text between "with " and ", type"
  mutate(complication_type = ifelse(no_complication == 'Complications',
                                    str_to_sentence(str_extract(all_diagnoses,'(?<=with ).*(?=, *type)')),
                                    'None')) |>
  # Descriptions without such a text yield NA and are all labelled as pregnancy
  # complications. NOTE(review): verify no other diagnosis lands in this bucket
  mutate(complication_type = ifelse(is.na(complication_type),'Pregnancy complications',complication_type)) |>
  # Put 'None' last among the factor levels
  mutate(complication_type = factor(complication_type, 
                                    levels = c(unique(complication_type[complication_type != 'None']), 'None'))) |>
  ggplot(aes(x = uncontrolled, y=count, fill = complication_type)) +
  # geom_col() plots the precomputed counts as they are; it replaces
  # geom_bar(stat='identity',), which carried a dangling empty argument
  geom_col() +
  facet_wrap(~type) +
  labs(title = 'Diabetes management and complications',
       subtitle = str_c('For patients with diabetes as primary or secondary diagnosis (',
                        sum(diabetes_count$count),' patients)'),
       x = 'Controllation of disease', 
       y = 'Number of instances', 
       fill = 'Complications') +
  theme_minimal() +
  scale_x_discrete(labels = function(x) str_wrap(x,width = 17)) +
  scale_fill_viridis_d(option = 'viridis',direction=-1) 

Only for people with complications

# Same plot as for all patients, restricted to diagnoses that mention a
# complication (everything except "without mention of complication").
diabetes_count |>
  mutate(
    # the trailing space keeps "type I " from matching "type II"
    type = ifelse(
      str_detect(all_diagnoses, "type I "),
      "Type I",
      "Type II or unspecified"
    ),
    # the leading comma excludes "not stated as uncontrolled"
    uncontrolled = ifelse(
      str_detect(all_diagnoses, ", uncontrolled"),
      "Uncontrolled",
      "Not specified as uncontrolled"
    ),
    no_complication = ifelse(
      str_detect(all_diagnoses, "without mention of complication"),
      "No complication",
      "Complications"
    ),
    # the complication is the text between "with " and ", type"
    complication_type = ifelse(
      no_complication == "Complications",
      str_to_sentence(str_extract(all_diagnoses, "(?<=with ).*(?=, *type)")),
      "None"
    ),
    # NOTE(review): diagnoses not following the "with <complication>, type"
    # pattern are all labelled as pregnancy-related; confirm this is intended.
    complication_type = ifelse(
      is.na(complication_type),
      "Pregnancy complications",
      complication_type
    )
  ) |>
  filter(complication_type != "None") |>
  ggplot(aes(x = uncontrolled, y = count, fill = complication_type)) +
  geom_col() +
  facet_wrap(~type) +
  labs(
    title = "Diabetes management and complications",
    subtitle = str_c(
      'For patients with diabetes as primary or secondary diagnosis\n(only for cases with complications: ',
      diabetes_count |>
        filter(!str_detect(all_diagnoses, "without mention of complication")) |>
        pull(count) |>
        sum(),
      " patients)"
    ),
    x = "Disease control status",
    y = "Number of instances",
    fill = "Complications"
  ) +
  theme_minimal() +
  scale_fill_viridis_d(option = "plasma", direction = -1) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17))

PCA

A PCA is conducted.

Import dataset

library(tidyverse)
library(broom)
library(cowplot)
library(ggplot2)
library(here)
# Read the augmented data set from the project's data directory.
aug_data <- here("data/03_dat_aug.csv") |>
  read_csv()

Start of the PCA

Looking at the data in PC coordinates:

# PCA needs complete numeric input. Record which rows are complete so that the
# full data set can be subset to exactly the same rows; tibbles do not keep the
# original row names after na.omit(), so row names cannot be used for this.
numeric_cols <- aug_data |>
  select(where(is.numeric))
complete_rows <- complete.cases(numeric_cols)

# NOTE(review): the identifier columns encounter_id and patient_nbr are numeric
# and therefore enter the PCA; consider excluding them from the analysis.
numeric_data <- numeric_cols[complete_rows, ] |>
  # constant columns cannot be scaled to unit variance
  select(where(\(col) var(col, na.rm = TRUE) > 0))

# Scale every variable to unit variance before extracting the components.
pca_fit <- numeric_data |>
  prcomp(scale. = TRUE)

# Rows of the full data set that correspond one-to-one to the PCA scores.
data_clean <- aug_data[complete_rows, ]

# Scatter plot of the observations in the PC1/PC2 plane.
pca_plot <- pca_fit |>
  augment(data_clean) |>
  ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_point(size = 1.5) +
  theme_half_open(12) +
  background_grid()
pca_plot

Now we extract the rotation matrix

# Loadings of every variable on every principal component, in long format.
tidy(pca_fit, matrix = "rotation")
# A tibble: 144 × 3
   column          PC    value
   <chr>        <dbl>    <dbl>
 1 encounter_id     1  0.0920 
 2 encounter_id     2 -0.113  
 3 encounter_id     3  0.616  
 4 encounter_id     4 -0.00514
 5 encounter_id     5  0.200  
 6 encounter_id     6 -0.0289 
 7 encounter_id     7  0.166  
 8 encounter_id     8  0.0607 
 9 encounter_id     9  0.504  
10 encounter_id    10 -0.327  
# ℹ 134 more rows

and plot it:

# Arrow head used for the loading vectors; `ends = "first"` puts the head at
# the (PC1, PC2) end because each segment is drawn from the loading to (0, 0).
arrow_style <- arrow(
  angle = 20, ends = "first", type = "closed", length = grid::unit(8, "pt")
)

# Plot the loadings of every variable on PC1 and PC2 as arrows from the origin.
pca_fit |>
  tidy(matrix = "rotation") |>
  pivot_wider(
    names_from = "PC",
    names_prefix = "PC",
    values_from = "value"
  ) |>
  ggplot(aes(PC1, PC2)) +
  geom_segment(
    xend = 0,
    yend = 0,
    arrow = arrow_style
  ) +
  geom_text(
    aes(label = column),
    hjust = 1, nudge_x = -0.02,
    color = "#904C2F"
  ) +
  # The limits must contain every loading: values outside the scale range are
  # dropped from the plot (PC1 reaches 0.54 and PC2 reaches -0.51 here).
  xlim(-1.25, 0.75) +
  ylim(-0.75, 1) +
  coord_fixed() +
  theme_minimal_hgrid(12)
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_text()`).

Look at the variance

# Standard deviation and share of variance explained by each component.
tidy(pca_fit, matrix = "eigenvalues")
# A tibble: 12 × 4
      PC std.dev percent cumulative
   <dbl>   <dbl>   <dbl>      <dbl>
 1     1   1.65  0.228        0.228
 2     2   1.39  0.162        0.389
 3     3   1.30  0.141        0.530
 4     4   0.992 0.0820       0.612
 5     5   0.973 0.0788       0.691
 6     6   0.931 0.0723       0.763
 7     7   0.854 0.0607       0.824
 8     8   0.804 0.0538       0.877
 9     9   0.721 0.0433       0.921
10    10   0.682 0.0387       0.959
11    11   0.647 0.0349       0.994
12    12   0.262 0.00572      1    

And now the plot:

# Share of the total variance explained by each principal component.
pca_eigenvalues <- pca_fit |>
  tidy(matrix = "eigenvalues")

pca_bar_plot <- pca_eigenvalues |>
  ggplot(aes(PC, percent)) +
  geom_col(
    fill = "#56B4E9",
    alpha = 0.8
  ) +
  # one axis break per component, however many components the fit has
  scale_x_continuous(breaks = pca_eigenvalues$PC) +
  scale_y_continuous(
    labels = scales::percent_format(),
    expand = expansion(mult = c(0, 0.01))
  ) +
  theme_minimal_hgrid(12)
pca_bar_plot

It is seen that 22.8% of the variance is explained by PC1. The drop-off occurs after PC3.

Exporting the key plots

# Write the two key PCA figures to the results folder with identical
# dimensions and resolution; the list names are the output file names.
key_plots <- list(
  "06_pca_plot.png" = pca_plot,
  "06_pca_bar_plot.png" = pca_bar_plot
)

for (plot_file in names(key_plots)) {
  ggsave(
    filename = here("results", plot_file),
    plot = key_plots[[plot_file]],
    width = 8,
    height = 6,
    dpi = 300
  )
}

We can then extract what each principal component is made up of:

# Loadings as a wide table: one row per variable, one column per component.
loadings <- as_tibble(pca_fit$rotation, rownames = "Variable")

print(loadings)
# A tibble: 12 × 13
   Variable        PC1     PC2     PC3      PC4     PC5      PC6     PC7     PC8
   <chr>         <dbl>   <dbl>   <dbl>    <dbl>   <dbl>    <dbl>   <dbl>   <dbl>
 1 encounter_id 0.0920 -0.113   0.616  -0.00514  0.200  -0.0289   0.166   0.0607
 2 patient_nbr  0.0647 -0.124   0.604  -0.0877   0.151  -0.00526  0.332   0.148 
 3 nr_encounte… 0.507   0.283  -0.0516  0.0250   0.128  -0.143    0.0778 -0.0949
 4 time_in_hos… 0.242  -0.429  -0.227  -0.174   -0.0544 -0.0330  -0.0464  0.635 
 5 total_time_… 0.537   0.136  -0.130  -0.0491   0.112  -0.187    0.0628  0.119 
 6 num_lab_pro… 0.134  -0.341  -0.126  -0.543   -0.278   0.279    0.404  -0.465 
 7 num_procedu… 0.0368 -0.385  -0.133   0.659    0.252   0.0222   0.187  -0.358 
 8 num_medicat… 0.221  -0.505  -0.107   0.215    0.0194  0.0150   0.0189  0.121 
 9 number_outp… 0.118   0.0191  0.214   0.335   -0.865  -0.260    0.0860  0.0233
10 number_emer… 0.233   0.157   0.118   0.218   -0.124   0.886   -0.164   0.152 
11 number_inpa… 0.448   0.230  -0.0274  0.0124   0.0406 -0.0253   0.0706 -0.229 
12 number_diag… 0.214  -0.303   0.281  -0.154   -0.0167 -0.102   -0.782  -0.335 
# ℹ 4 more variables: PC9 <dbl>, PC10 <dbl>, PC11 <dbl>, PC12 <dbl>

Using k-means to cluster patients

# Attach the PCA scores to the cleaned data.
pca_data <- pca_fit |>
  augment(data_clean)

# k-means starts from randomly chosen centres: fix the seed so the clusters
# (and the exported figure) are reproducible, and use several random starts so
# the result does not depend on one unlucky initialisation.
set.seed(3)
pc_clusters <- pca_data |>
  select(.fittedPC1:.fittedPC3) |>
  kmeans(centers = 3, nstart = 25)

pca_data <- pca_data |>
  mutate(cluster = as.factor(pc_clusters$cluster))

# Observations in the PC1/PC2 plane, coloured by their k-means cluster.
pca_clustered <- pca_data |>
  ggplot(aes(x = .fittedPC1, y = .fittedPC2, color = cluster)) +
  labs(
    x = "PC1",
    y = "PC2",
    title = "Clustering of patient groups based on PCA"
  ) +
  geom_point() +
  theme_minimal()
pca_clustered

Exporting the key plot again

# Write the cluster plot to the results folder.
clustered_plot_path <- here("results", "06_pca_clustered.jpg")
ggsave(filename = clustered_plot_path, plot = pca_clustered,
       width = 8, height = 6, dpi = 300)

Summary

  • PC1: Likely represents overall patient interaction with the healthcare system.

  • PC2: Likely represents treatment intensity or complexity.

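  • PC3: Dominated by the record identifiers encounter_id and patient_nbr (loadings of about 0.6 each), so it mostly reflects record numbering rather than inpatient care or hospitalization frequency.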

Shiny

A shiny app is created to discover correlation between readmission and medications.

library("shiny")

# Load necessary scripts
source("ui.R")
source("server.R")

# Run the Shiny app
shinyApp(ui = ui, server = server)

library(shiny)

# Data used only to populate the filter choices below.
# NOTE(review): the path is relative to the app's working directory, while the
# analysis reads the same file from the project's data folder; confirm it
# resolves when the app is started.
df <- read.csv("03_dat_aug.csv")

# Define UI
ui <- fluidPage(
  titlePanel("Diabetes Patient Readmissions"),

  sidebarLayout(
    sidebarPanel(
      # File upload to load the data
      fileInput("data_file", "Upload Dataset", accept = ".csv"),

      # Filter for readmission status
      selectInput(
        "readmission_filter",
        "Filter by Readmission Status:",
        choices = c("All", unique(df$readmitted)),
        selected = "All"
      ),
      selectInput(
        "age",
        "Filter by age group:",
        choices = c("All", sort(unique(df$age))),
        selected = "All"
      ),
      selectInput(
        "race",
        "Filter by race:",
        choices = c("All", sort(unique(df$race))),
        selected = "All"
      ),
      selectInput(
        "gender",
        "Filter by gender",
        choices = c("All", sort(unique(df$gender))),
        selected = "All"
      ),

      # Checkboxes for medications
      checkboxGroupInput(
        "medication_filter",
        "Filter by Medications:",
        choices = c(
          "Metformin", "Repaglinide", "Nateglinide", "Chlorpropamide",
          "Glimepiride", "Acetohexamide", "Glipizide", "Glyburide",
          "Tolbutamide", "Pioglitazone", "Rosiglitazone", "Acarbose",
          "Miglitol", "Troglitazone", "Tolazamide", "Examide",
          "Sitagliptin", "Insulin", "Glyburide-Metformin",
          "Glipizide-Metformin", "Glimepiride-Pioglitazone",
          "Metformin-Rosiglitazone", "Metformin-Pioglitazone"
        ),
        selected = NULL
      )
    ),

    mainPanel(
      # Display a bar plot
      plotOutput("readmissionPlot")
    )
  )
)

library(shiny)

# Define server logic
# NOTE(review): `load_data()` is not defined in the visible part of the file;
# the column names used below (`Readmitted`, 0/1 medication columns) are
# presumably produced by it - confirm.
# NOTE(review): only `readmission_filter` and `medication_filter` are read
# here; the `age`, `race` and `gender` inputs are not applied.
server <- function(input, output) {
  # Reactive data loading
  dataset <- reactive({
    req(input$data_file) # Ensure a file is uploaded
    load_data(input$data_file$datapath) # Call the load_data function
  })

  # Reactive filtering
  filtered_data <- reactive({
    data <- dataset()

    # Filter by readmission status
    if (input$readmission_filter != "All") {
      data <- data[data$Readmitted == input$readmission_filter, ]
    }

    # Filter by medications
    if (!is.null(input$medication_filter)) {
      for (med in input$medication_filter) {
        if (med %in% colnames(data)) { # Only filter if column exists
          data <- data[data[[med]] == 1, ]
        } else {
          warning(paste("Medication column", med, "not found in dataset"))
        }
      }
    }

    if (nrow(data) == 0) {
      return(NULL) # Handle empty results gracefully
    }

    data
  })

  # Render the bar plot
  output$readmissionPlot <- renderPlot({
    data <- filtered_data()
    if (is.null(data)) {
      plot.new()
      text(0.5, 0.5, "No data to display after filtering", cex = 1.5)
      return()
    }

    # Count the number of patients by readmission status
    readmission_counts <- table(data$Readmitted)

    # Create a bar plot
    barplot(
      readmission_counts,
      main = "Readmission Status",
      ylab = "Number of Patients",
      xlab = "Readmission",
      col = c("skyblue", "orange", "lightgreen")
    )
  })
}

Render all scripts

Load data

# Render the data-loading document.
quarto::quarto_render(input = "01_load.qmd")


processing file: 01_load.qmd

  |                                                          
  |                                                    |   0%
  |                                                          
  |......                                              |  11%                  
  |                                                          
  |............                                        |  22% [unnamed-chunk-1]
  |                                                          
  |.................                                   |  33%                  
  |                                                          
  |.......................                             |  44% [unnamed-chunk-2]
  |                                                          
  |.............................                       |  56%                  
  |                                                          
  |...................................                 |  67% [unnamed-chunk-3]trying URL 'https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip'
downloaded 3.2 MB


  |                                                          
  |........................................            |  78%                  
  |                                                          
  |..............................................      |  89% [unnamed-chunk-4]trying URL 'https://www.cms.gov/medicare/coding/icd9providerdiagnosticcodes/downloads/icd-9-cm-v32-master-descriptions.zip'
Content type 'application/zip' length 1079497 bytes (1.0 MB)
==================================================
downloaded 1.0 MB


  |                                                          
  |....................................................| 100%                  
                                                                                                            
output file: 01_load.knit.md

pandoc 
  to: html
  output-file: 01_load.html
  standalone: true
  embed-resources: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: 'Load: Group 03'
  editor: visual
  
Output created: ../results/R/01_load.html

Clean Data

# Render the data-cleaning document.
quarto::quarto_render(input = "02_clean.qmd")


processing file: 02_clean.qmd

  |                                                          
  |                                                    |   0%
  |                                                          
  |...                                                 |   7%                  
  |                                                          
  |.......                                             |  13% [unnamed-chunk-1]
  |                                                          
  |..........                                          |  20%                  
  |                                                          
  |..............                                      |  27% [unnamed-chunk-2]
  |                                                          
  |.................                                   |  33%                  
  |                                                          
  |.....................                               |  40% [unnamed-chunk-3]
  |                                                          
  |........................                            |  47%                  
  |                                                          
  |............................                        |  53% [unnamed-chunk-4]
  |                                                          
  |...............................                     |  60%                  
  |                                                          
  |...................................                 |  67% [unnamed-chunk-5]
  |                                                          
  |......................................              |  73%                  
  |                                                          
  |..........................................          |  80% [unnamed-chunk-6]
  |                                                          
  |.............................................       |  87%                  
  |                                                          
  |.................................................   |  93% [unnamed-chunk-7]
  |                                                          
  |....................................................| 100%                  
                                                                                                            
output file: 02_clean.knit.md

pandoc 
  to: html
  output-file: 02_clean.html
  standalone: true
  embed-resources: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: 'Clean: Group 03'
  editor: visual
  
Output created: ../results/R/02_clean.html

Augment Data

# Render the data-augmentation document.
quarto::quarto_render(input = "03_augment.qmd")


processing file: 03_augment.qmd

  |                                                          
  |                                                    |   0%
  |                                                          
  |.....                                               |   9%                  
  |                                                          
  |.........                                           |  18% [unnamed-chunk-1]
  |                                                          
  |..............                                      |  27%                  
  |                                                          
  |...................                                 |  36% [unnamed-chunk-2]
  |                                                          
  |........................                            |  45%                  
  |                                                          
  |............................                        |  55% [unnamed-chunk-3]
  |                                                          
  |.................................                   |  64%                  
  |                                                          
  |......................................              |  73% [unnamed-chunk-4]
  |                                                          
  |...........................................         |  82%                  
  |                                                          
  |...............................................     |  91% [unnamed-chunk-5]
  |                                                          
  |....................................................| 100%                  
                                                                                                            
output file: 03_augment.knit.md

pandoc 
  to: html
  output-file: 03_augment.html
  standalone: true
  embed-resources: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: 'Augment: Group 03'
  editor: visual
  
Output created: ../results/R/03_augment.html

Description

# Render the data-description document.
quarto::quarto_render(input = "04_description.qmd")


processing file: 04_description.qmd

  |                                                         
  |                                                   |   0%
  |                                                         
  |..                                                 |   4%                   
  |                                                         
  |....                                               |   9% [unnamed-chunk-1] 
  |                                                         
  |.......                                            |  13%                   
  |                                                         
  |.........                                          |  17% [unnamed-chunk-2] 
  |                                                         
  |...........                                        |  22%                   
  |                                                         
  |.............                                      |  26% [unnamed-chunk-3] 
  |                                                         
  |................                                   |  30%                   
  |                                                         
  |..................                                 |  35% [unnamed-chunk-4] 
  |                                                         
  |....................                               |  39%                   
  |                                                         
  |......................                             |  43% [unnamed-chunk-5] 
  |                                                         
  |........................                           |  48%                   
  |                                                         
  |...........................                        |  52% [unnamed-chunk-6] 
  |                                                         
  |.............................                      |  57%                   
  |                                                         
  |...............................                    |  61% [unnamed-chunk-7] 
  |                                                         
  |.................................                  |  65%                   
  |                                                         
  |...................................                |  70% [unnamed-chunk-8] 
  |                                                         
  |......................................             |  74%                   
  |                                                         
  |........................................           |  78% [unnamed-chunk-9] 
  |                                                         
  |..........................................         |  83%                   
  |                                                         
  |............................................       |  87% [unnamed-chunk-10]
  |                                                         
  |...............................................    |  91%                   
  |                                                         
  |.................................................  |  96% [unnamed-chunk-11]
  |                                                         
  |...................................................| 100%                   
                                                                                                             
output file: 04_description.knit.md

pandoc 
  to: html
  output-file: 04_description.html
  standalone: true
  embed-resources: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: 'Description: Group 03'
  editor: visual
  
Output created: ../results/R/04_description.html

Visualizations

# Render the visualizations document.
quarto::quarto_render(input = "05_visualizations.qmd")


processing file: 05_visualizations.qmd

  |                
  |          |   0%
  |                
  |          |   3%                                                                        
  |                
  |.         |   5% [unnamed-chunk-1]                                                      
  |                
  |.         |   8%                                                                        
  |                
  |.         |  10% [unnamed-chunk-2]                                                      
  |                
  |.         |  13%                                                                        
  |                
  |..        |  15% [average_number_of_diagnoses_for_all]                                  
  |                
  |..        |  18%                                                                        
  |                
  |..        |  21% [average_diagnoses_for_all_races]                                      
  |                
  |..        |  23%                                                                        
  |                
  |...       |  26% [race_gender_distribution]                                             
  |                
  |...       |  28%                                                                        
  |                
  |...       |  31% [discharge_disposition_for_all_admission_types]                        
  |                
  |...       |  33%                                                                        
  |                
  |....      |  36% [discharge_disposition_for_emergency_admissions]                       
  |                
  |....      |  38%                                                                        
  |                
  |....      |  41% [discharge_disposition_for_urgent_admissions]                          
  |                
  |....      |  44%                                                                        
  |                
  |.....     |  46% [discharge_disposition_for_elective_admissions]                        
  |                
  |.....     |  49%                                                                        
  |                
  |.....     |  51% [number_of_visits_per_patient]                                         
  |                
  |.....     |  54%                                                                        
  |                
  |......    |  56% [diagnosis_for_1_visit]                                                
  |                
  |......    |  59%                                                                        
  |                
  |......    |  62% [diagnosis_for_2_visits]                                               
  |                
  |......    |  64%                                                                        
  |                
  |.......   |  67% [diagnosis_for_3_visits]                                               
  |                
  |.......   |  69%                                                                        
  |                
  |.......   |  72% [average_number_of_diagnoses_per_visit_by_patient]                     
  |                
  |.......   |  74%                                                                        
  |                
  |........  |  77% [most_common_primary_diagnoses_per_visit]                              
  |                
  |........  |  79%                                                                        
  |                
  |........  |  82% [unnamed-chunk-3]                                                      
  |                
  |........  |  85%                                                                        
  |                
  |......... |  87% [unnamed-chunk-4]                                                      
  |                
  |......... |  90%                                                                        
  |                
  |......... |  92% [diabetes_management_and_complications_for_all_patients]               
  |                
  |......... |  95%                                                                        
  |                
  |..........|  97% [diabetes_management_and_complications_for_patients_with_complications]
  |                
  |..........| 100%                                                                        
                                                                                                                                                                  
output file: 05_visualizations.knit.md

pandoc 
  to: html
  output-file: 05_visualizations.html
  standalone: true
  embed-resources: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: 'Visulizations: Group 03'
  editor: visual
  
Output created: ../results/R/05_visualizations.html

PCA analysis

# Render the PCA analysis document.
quarto::quarto_render(input = "06_analysis_PCA.qmd")


processing file: 06_analysis_PCA.qmd

  |                                                     
  |                                               |   0%
  |                                                     
  |..                                             |   5%                       
  |                                                     
  |....                                           |  10% [unnamed-chunk-1]     
  |                                                     
  |.......                                        |  14%                       
  |                                                     
  |.........                                      |  19% [pc_coordinate_plot]  
  |                                                     
  |...........                                    |  24%                       
  |                                                     
  |.............                                  |  29% [unnamed-chunk-2]     
  |                                                     
  |................                               |  33%                       
  |                                                     
  |..................                             |  38% [rotation_matrix_plot]
  |                                                     
  |....................                           |  43%                       
  |                                                     
  |......................                         |  48% [unnamed-chunk-3]     
  |                                                     
  |.........................                      |  52%                       
  |                                                     
  |...........................                    |  57% [variance_plot]       
  |                                                     
  |.............................                  |  62%                       
  |                                                     
  |...............................                |  67% [unnamed-chunk-4]     
  |                                                     
  |..................................             |  71%                       
  |                                                     
  |....................................           |  76% [unnamed-chunk-5]     
  |                                                     
  |......................................         |  81%                       
  |                                                     
  |........................................       |  86% [unnamed-chunk-6]     
  |                                                     
  |...........................................    |  90%                       
  |                                                     
  |.............................................  |  95% [unnamed-chunk-7]     
  |                                                     
  |...............................................| 100%                       
                                                                                                                 
output file: 06_analysis_PCA.knit.md

pandoc 
  to: html
  output-file: 06_analysis_PCA.html
  standalone: true
  embed-resources: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: 'PCA: Group 03'
  editor: visual
  
Output created: ../results/R/06_analysis_PCA.html